This notebook loads an MDT dataset, visualizes it in 2-D using t-SNE, and applies DBSCAN to detect clusters and the associated state transitions ("change-points").
%load_ext autoreload
%autoreload 2
import modules.dataset as ds
# Unpack the demo dataset archive into ./output so it can be loaded below.
ds.extract_dataset('./datasets/mdt-demo.tgz', './output')
import modules.mdt.datasets as mdt_ds
datasets = mdt_ds.Datasets(datasets_dir='./output')
# Interactive Jupyter widget for choosing the dataset and device to analyze.
datasets.jupyter_select_dataset_device(select_file=False)
Box(children=(Dropdown(description='Dataset:', layout=Layout(display='flex', justify_content='flex-start', wid…
import pandas as pd
import modules.utils as utils
# Locate the preprocessed offline CSV for the dataset/device selected above.
data_fn, _ = datasets.get_input_data_file("preprocessed_offline.csv")
# Pass the path directly so pandas opens AND closes the file itself
# (the previous open(data_fn, 'rb') leaked an unclosed file handle).
df = pd.read_csv(data_fn)
# Preview: first 19 rows of the first 9 columns (timestamp + 8 features).
utils.displayDataFrame(df.iloc[0:19,0:9])
| ts | n0:Cisco-IOS-XR-drivers-media-eth-oper:ethernet-interface_statistics_statistic.csv:HundredGigE0/0/0/0:received-good-bytes | n0:Cisco-IOS-XR-drivers-media-eth-oper:ethernet-interface_statistics_statistic.csv:HundredGigE0/0/0/0:received-good-frames | n0:Cisco-IOS-XR-drivers-media-eth-oper:ethernet-interface_statistics_statistic.csv:HundredGigE0/0/0/0:received-multicast-frames | n0:Cisco-IOS-XR-drivers-media-eth-oper:ethernet-interface_statistics_statistic.csv:HundredGigE0/0/0/0:received-total-bytes | n0:Cisco-IOS-XR-drivers-media-eth-oper:ethernet-interface_statistics_statistic.csv:HundredGigE0/0/0/0:received-total-frames | n0:Cisco-IOS-XR-drivers-media-eth-oper:ethernet-interface_statistics_statistic.csv:HundredGigE0/0/0/0:received-total-octet-frames-from1024-to1518 | n0:Cisco-IOS-XR-drivers-media-eth-oper:ethernet-interface_statistics_statistic.csv:HundredGigE0/0/0/0:received-total-octet-frames-from128-to255 | n0:Cisco-IOS-XR-drivers-media-eth-oper:ethernet-interface_statistics_statistic.csv:HundredGigE0/0/0/0:received-total-octet-frames-from1519-to-max |
|---|---|---|---|---|---|---|---|---|
| 1558249381.658611 | 0.681327 | 0.687531 | 0.504115 | 0.681327 | 0.687531 | 0.451305 | 0.858389 | 0.681585 |
| 1558249391.658611 | 0.681327 | 0.687531 | 0.504115 | 0.681327 | 0.687531 | 0.451305 | 0.858389 | 0.681585 |
| 1558249401.658611 | 0.644663 | 0.648323 | 0.258243 | 0.644663 | 0.648323 | 0.517278 | 0.741312 | 0.644928 |
| 1558249411.658611 | 0.626289 | 0.628532 | 0.129121 | 0.626289 | 0.628532 | 0.558469 | 0.677508 | 0.626523 |
| 1558249421.658611 | 0.616965 | 0.618757 | 0.294610 | 0.616965 | 0.618757 | 0.586249 | 0.653254 | 0.617207 |
| 1558249431.658611 | 0.608525 | 0.610206 | 0.169590 | 0.608525 | 0.610206 | 0.606070 | 0.636611 | 0.608696 |
| 1558249441.658611 | 0.607697 | 0.609107 | 0.314231 | 0.607697 | 0.609107 | 0.637078 | 0.624820 | 0.607856 |
| 1558249451.658611 | 0.605199 | 0.606604 | 0.409198 | 0.605199 | 0.606604 | 0.633205 | 0.620602 | 0.605310 |
| 1558249461.658611 | 0.617882 | 0.619045 | 0.227750 | 0.617882 | 0.619045 | 0.648154 | 0.627273 | 0.618074 |
| 1558249471.658611 | 0.604494 | 0.605476 | 0.113875 | 0.604494 | 0.605476 | 0.626065 | 0.611651 | 0.604676 |
| 1558249481.658611 | 0.606327 | 0.607145 | 0.285342 | 0.606327 | 0.607145 | 0.636013 | 0.609389 | 0.606486 |
| 1558249491.658611 | 0.604294 | 0.604613 | 0.166601 | 0.604294 | 0.604613 | 0.650453 | 0.591848 | 0.604305 |
| 1558249501.658611 | 0.581812 | 0.580794 | 0.311120 | 0.581812 | 0.580794 | 0.604260 | 0.516707 | 0.581356 |
| 1558249511.658611 | 0.586338 | 0.585610 | 0.407645 | 0.586338 | 0.585610 | 0.604076 | 0.529965 | 0.585930 |
| 1558249521.658611 | 0.597648 | 0.597462 | 0.228587 | 0.597648 | 0.597462 | 0.608988 | 0.568426 | 0.597505 |
| 1558249531.658611 | 0.617345 | 0.617636 | 0.114293 | 0.617345 | 0.617636 | 0.640904 | 0.606600 | 0.617392 |
| 1558249541.658611 | 0.606407 | 0.606873 | 0.282862 | 0.606407 | 0.606873 | 0.631316 | 0.605719 | 0.606531 |
| 1558249551.658611 | 0.607281 | 0.607856 | 0.168050 | 0.607281 | 0.607856 | 0.649228 | 0.612613 | 0.607421 |
| 1558249561.658611 | 0.612705 | 0.613097 | 0.309156 | 0.612705 | 0.613097 | 0.654539 | 0.613460 | 0.612936 |
import re
from datetime import datetime, timezone
import numpy as np
MIN_TIMESTAMP = -62135596800  # 0001-01-01T00:00:00Z as a UNIX timestamp (effectively "no lower bound")
MAX_TIMESTAMP = 253402214400  # 9999-12-31T00:00:00Z as a UNIX timestamp (effectively "no upper bound")
# Keys for load_data()'s data_selection: which feature sets get concatenated
# into the returned matrix.
ORIGINAL_DATA = "original data"
REDUCED_DATA = "reduced data"
FIRST_DERIVATIVE = "first derivative"
SECOND_DERIVATIVE = "second derivative"
def get_feature_names_bis(path, delimiter=','):
    """Return the column names from the first (header) line of a delimited file.

    A more direct and simpler implementation than get_feature_names().
    """
    # Only the header line is needed; read it and split on the delimiter.
    with open(path, "r") as csv_file:
        first_line = csv_file.readline()
    return first_line.strip('\n').split(delimiter)
def scale_data(d, eps=1e-6):
    """Standardize each column of *d*: zero mean, unit standard deviation.

    Columns whose standard deviation is below *eps* (near-constant features)
    are divided by 1 instead, avoiding division by ~0; they end up as
    all-(near-)zero columns after centering.

    :param d: 2-D numpy array (rows = samples, columns = features).
    :param eps: threshold below which a column's std is treated as zero
                (previously hard-coded to 1e-6).
    :return: a new standardized array; the input array is not modified.
    """
    d = d - np.mean(d, axis=0)        # center each feature (makes a copy)
    ft_scale = np.std(d, axis=0)
    # Replace near-zero scales with 1 so constant columns don't blow up.
    z_index = np.where(ft_scale < eps)
    ft_scale[z_index] = 1
    return d / ft_scale
def load_data(in_fn, reduced=None, startTime=MIN_TIMESTAMP, endTime=MAX_TIMESTAMP,
              scale=False, data_selection=None, ft_regex=None, remove_nan=False,
              remove_inf=False) -> "tuple[np.ndarray, pd.DataFrame]":
    """Load a preprocessed MDT CSV file and assemble the selected feature matrix.

    :param in_fn: CSV path; first row is the header, first column the UNIX timestamp.
    :param reduced: pre-computed dimensionality-reduced 2-D array; required
                    only when REDUCED_DATA is selected.
    :param startTime: inclusive lower time bound (UNIX timestamp, or datetime
                      interpreted as UTC).
    :param endTime: inclusive upper time bound (same conventions).
    :param scale: if True, standardize the features with scale_data().
    :param data_selection: one of the ORIGINAL_DATA / REDUCED_DATA /
                           FIRST_DERIVATIVE / SECOND_DERIVATIVE constants, or
                           a dict mapping those constants to booleans.
                           Defaults to ORIGINAL_DATA (the previous default of
                           {} raised KeyError when used).
    :param ft_regex: optional case-insensitive regex; only feature names it
                     matches are kept.
    :param remove_nan: drop every column containing at least one NaN.
    :param remove_inf: drop every column containing at least one +/-inf.
    :return: (timestamps, DataFrame whose first column is 'ts').
    :raises ValueError: if REDUCED_DATA is selected but *reduced* is None.
    """
    data = np.genfromtxt(in_fn, dtype=float, delimiter=',', skip_header=1)
    # Normalize data_selection into a dict; missing keys count as False
    # (the old code indexed the dict directly and KeyError'ed on them).
    if data_selection is None:
        data_selection = ORIGINAL_DATA
    if isinstance(data_selection, str):
        data_selection = {data_selection: True}

    def _selected(key):
        return data_selection.get(key, False)

    tstp = data[:, 0]
    data = data[:, 1:]
    ft_names = np.asarray(get_feature_names_bis(in_fn)[1:])
    if ft_regex:
        ft_filter = re.compile(ft_regex, re.IGNORECASE)
        ft_idx = np.array([i for i, v in enumerate(map(ft_filter.match, ft_names)) if v is not None])
        if len(ft_idx) > 0:
            data = data[:, ft_idx]
            ft_names = ft_names[ft_idx]
        else:
            # Keep a 2-D (n, 0) shape so the np.append(..., axis=1) calls
            # below remain valid (a 1-D empty array would break them).
            data = np.empty((data.shape[0], 0))
            ft_names = np.array([])
    if remove_nan:
        inval_col = np.where(np.any(np.isnan(data), axis=0))
        data = np.delete(data, inval_col, axis=1)
        ft_names = np.delete(ft_names, inval_col)
    if remove_inf:
        inval_col = np.where(np.any(np.isinf(data), axis=0))
        data = np.delete(data, inval_col, axis=1)
        ft_names = np.delete(ft_names, inval_col)
    if scale:
        data = scale_data(data)
    final_names = np.asarray([])
    final_data = np.array([[] for _ in range(len(data))])
    derivative = None
    if _selected(FIRST_DERIVATIVE) or _selected(SECOND_DERIVATIVE):
        derivative = np.diff(data, axis=0)
    if _selected(ORIGINAL_DATA):
        final_data = np.append(final_data, data, axis=1)
        final_names = np.append(final_names, ft_names)
    if _selected(REDUCED_DATA):
        if reduced is None:
            raise ValueError("data_selection includes REDUCED_DATA but 'reduced' is None")
        final_data = np.append(final_data, reduced, axis=1)
        final_names = np.append(final_names, [f"{x}_bytes-sent_reduced" for x in range(len(reduced[0]))])
    if _selected(FIRST_DERIVATIVE):
        # Repeat the first derivative row so the row count matches the data.
        final_data = np.append(final_data, np.vstack([derivative[0,:], derivative]), axis=1)
        final_names = np.append(final_names, [f"{x}_bytes-send_deriv" for x in ft_names])
    if _selected(SECOND_DERIVATIVE):
        second_derivative = np.diff(derivative, axis=0)
        # Pad with two copies of the first row to restore the original row count.
        second_derivative = np.vstack([second_derivative[0,:], second_derivative[0,:], second_derivative])
        final_data = np.append(final_data, second_derivative, axis=1)
        final_names = np.append(final_names, [f"{x}_bytes-sent_deriv2" for x in ft_names])
    # Prepend the timestamp column.
    final_data = np.append(tstp.reshape(-1,1), final_data, axis=1)
    final_names = np.append(np.asarray('ts'), final_names)
    # Convert datetime bounds to UTC UNIX timestamps, then filter rows by time.
    if isinstance(startTime, datetime):
        startTime = startTime.replace(tzinfo=timezone.utc).timestamp()
    if isinstance(endTime, datetime):
        endTime = endTime.replace(tzinfo=timezone.utc).timestamp()
    final_data = final_data[
        (final_data[:,0] >= startTime) &
        (final_data[:,0] <= endTime)
    ]
    final_tstp = final_data[:,0]
    return final_tstp, pd.DataFrame(final_data, columns=final_names)
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
max_data_point_distance = 0.05  # DBSCAN eps, applied in the min-max-scaled t-SNE space
# Load every original feature except time/second counters
# (negative-lookahead regex excludes names containing "time" or "second").
tstp, dataframe = load_data(data_fn, scale=False, data_selection=ORIGINAL_DATA, ft_regex="^(?!.*(time|second)).*")
fulldata = dataframe.to_numpy(dtype=float)
tstp = fulldata[:,0]    # first column of the frame is the 'ts' timestamp
data = fulldata[:,1:]
# Embed the high-dimensional samples into 2-D with t-SNE (fixed seed for reproducibility).
solver = TSNE(n_components=2, init='pca', random_state=0)
reduced = solver.fit_transform(data)
# Cluster the min-max-scaled 2-D embedding; DBSCAN labels noise points as -1.
solver = DBSCAN(eps = max_data_point_distance)
clusters = solver.fit(MinMaxScaler().fit_transform(reduced)).labels_
# A change-point is any index where consecutive samples get different cluster labels.
changes = np.where(clusters[:-1] != clusters[1:])[0]
changepoints = []
for t in changes:
    changepoints.append(tstp[t])
print(changepoints)
[1558250581.658611, 1558251821.658611, 1558253001.658611, 1558254201.658611, 1558255381.658611, 1558256611.658611, 1558257801.658611, 1558258991.658611]
from modules.mdt.data_utils import plot_data_anime
import plotly.graph_objects as go
# One synthetic "event" per detected change-point, timestamped at the midpoint
# between the two consecutive samples whose cluster labels differ.
events = [
    {
        "timestamp": (tstp[t+1] + tstp[t])/2.0,
        "event": str(i+1),  # 1-based event label
        "device": datasets.get_device(),
        "interface": None
    } for i, t in enumerate(changes)]
plot_data, frames = plot_data_anime(reduced, tstp, events, color='rgb(128,177,211)')
# Animated plotly scatter of the 2-D t-SNE embedding with Go/Pause controls.
fig = go.Figure(
    data = plot_data,
    layout = {
        'title': "tSNE 2-D Visualization",
        'autosize': False,
        'width': 1000,
        'height': 1000,
        'updatemenus': [{
            'buttons': [
                # "Go": play the animation frames from the current position.
                {
                    'args': [None, {
                        'frame': {'duration': 100, 'redraw': False},
                        'fromcurrent': True, 'transition': {'duration': 50, 'easing': 'quadratic-in-out'}}],
                    'label': 'Go', 'method': 'animate'
                },
                # "Pause": stop immediately on the current frame.
                {
                    'args': [[None], {'frame': {'duration': 0, 'redraw': False}, 'mode': 'immediate',
                                      'transition': {'duration': 0}}],
                    'label': 'Pause',
                    'method': 'animate'
                }],
            'direction': 'left',
            'pad': {'r': 10, 't': 10},
            'showactive': False,
            'type': 'buttons',
            'x': 0.1,
            'xanchor': 'right',
            'y': 1,
            'yanchor': 'bottom'
        }]},
    frames = frames)
fig.show()